<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Declared first so non-ASCII characters in this file decode as UTF-8 -->
  <meta charset="utf-8">
  <title>OCR PDFs and images directly in your browser</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <script defer data-domain="tools.simonwillison.net" src="https://plausible.io/js/script.js"></script>
  <script type="module">
  // Loads PDF.js (4.2.67) as an ES module and points it at the worker build
  // of the same version.
  // NOTE(review): the default import binding `pdfjsDist` is never used. The
  // assignment below, and convertPDFToImages() in the main script, read a
  // global `pdfjsLib` instead - presumably registered on globalThis as a side
  // effect of importing the module. Confirm before changing this import.
  import pdfjsDist from 'https://cdn.jsdelivr.net/npm/pdfjs-dist@4.2.67/+esm';
  pdfjsLib.GlobalWorkerOptions.workerSrc =
  "https://cdn.jsdelivr.net/npm/pdfjs-dist@4.2.67/build/pdf.worker.min.mjs";
  </script>
  <script src="https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js"></script>
<style>
/* Page-wide typography and spacing */
body {
  padding: 1em;
  font-family: helvetica, sans-serif;
  line-height: 1.3;
}
/* Drop target / click-to-select area; its text doubles as the progress label */
.dropzone {
  box-sizing: border-box;
  width: 100%;
  height: 10em;
  border: 2px dashed #ccc;
  display: flex;
  justify-content: center;
  align-items: center;
  font-size: 24px;
  cursor: pointer;
  padding: 1em;
  margin-bottom: 1em;
}
/* Added by the script while a file is being fetched or processed */
.dropzone.disabled {
  cursor: not-allowed;
}
/* Added by the script while a file is dragged over the dropzone */
.dropzone.drag-over {
  background-color: pink;
}
/* Page/image previews never overflow the viewport width */
.image-container img {
  margin-bottom: 10px;
  max-width: 100%;
}
/* Default textarea size; setTextarea() overrides the height inline per result */
textarea {
  width: 100%;
  height: 10em;
  margin-bottom: 20px;
  box-sizing: border-box;
}
/* Hidden until showFullDocument() switches it to display: block */
.full-document-section {
  display: none;
  margin-bottom: 20px;
}
/* Row holding the "Load example PDF" button */
.actions {
  display: flex;
  gap: 0.5rem;
  align-items: center;
  margin: 0.5rem 0 1rem;
}
.actions button {
  padding: 0.5rem 0.75rem;
  font-size: 14px;
  cursor: pointer;
}
</style>
</head>
<body>
  <main>
    <h1>OCR PDFs and images directly in your browser</h1>
    <p>This tool runs entirely in your browser. No files are uploaded to a server.</p>
    <p>It uses <a href="https://tesseract.projectnaptha.com/">Tesseract.js</a> for OCR and <a href="https://mozilla.github.io/pdf.js/">PDF.js</a> to convert PDFs into images.</p>
    <!-- The placeholder option is replaced by the script with the full language list -->
    <p><label>Language: <select id="id_language"><option>ENG</option></select></label></p>
    <!-- Hidden file picker; the script opens it when the dropzone is clicked -->
    <input type="file" id="fileInput" accept=".pdf,.jpg,.jpeg,.png,.gif" hidden>

    <div class="actions">
      <button id="loadExampleButton" type="button">Load example PDF</button>
    </div>

    <div class="dropzone" id="dropzone">
      Drag and drop a PDF, JPG, PNG, or GIF file here or click to select a file
    </div>
    <div class="full-document-section" id="fullDocumentSection">
      <h2>Full document</h2>
      <textarea class="full-document" id="fullDocument"></textarea>
      <h2>Pages</h2>
    </div>
    <!-- The script appends one <img> + <textarea> pair here per page/image -->
    <div class="image-container"></div>
  </main>
  <footer>
    <p><a href="https://github.com/simonw/tools/blob/main/ocr.html">Source code</a>. <a href="https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/">How I built this</a>.</p>
  </footer>

<script>
// Pixel width every PDF page is rendered at before being OCRed
const desiredWidth = 1000;

// Handles to the page elements the functions below read and update
const dropzone = document.querySelector('#dropzone');
const fileInput = document.querySelector('#fileInput');
const imageContainer = document.querySelector('.image-container');
const fullDocumentTextarea = document.querySelector('#fullDocument');
const fullDocumentSection = document.querySelector('#fullDocumentSection');
const languageSelect = document.querySelector('#id_language');
const loadExampleButton = document.querySelector('#loadExampleButton');

// Remote PDF fetched by the "Load example PDF" button
const EXAMPLE_URL = 'https://static.simonwillison.net/static/cors-allow/2025/attention-all-you-need.pdf';
// Dropzone label as authored in the markup, captured before any progress text replaces it
const DROPZONE_DEFAULT_TEXT = dropzone.innerText;

// False while a file is being fetched or processed; gates drop, click and example loading
let fileSelectionAllowed = true;

// Human-readable names for language codes, keyed by the lowercase code used as
// each <option>'s value. The select-population loop looks codes up here and
// falls back to showing the raw code when a code has no entry.
const LANGUAGES = {
  "afr": "Afrikaans",
  "amh": "Amharic",
  "ara": "Arabic",
  "asm": "Assamese",
  "aze": "Azerbaijani",
  "aze_cyrl": "Azerbaijani - Cyrillic",
  "bel": "Belarusian",
  "ben": "Bengali",
  "bod": "Tibetan",
  "bos": "Bosnian",
  "bul": "Bulgarian",
  "cat": "Catalan; Valencian",
  "ceb": "Cebuano",
  "ces": "Czech",
  "chi_sim": "Chinese - Simplified",
  "chi_tra": "Chinese - Traditional",
  "chr": "Cherokee",
  "cym": "Welsh",
  "dan": "Danish",
  "deu": "German",
  "dzo": "Dzongkha",
  "ell": "Greek, Modern (1453-)",
  "eng": "English",
  "enm": "English, Middle (1100-1500)",
  "epo": "Esperanto",
  "est": "Estonian",
  "eus": "Basque",
  "fas": "Persian",
  "fin": "Finnish",
  "fra": "French",
  "frk": "German Fraktur",
  "frm": "French, Middle (ca. 1400-1600)",
  "gle": "Irish",
  "glg": "Galician",
  "grc": "Greek, Ancient (-1453)",
  "guj": "Gujarati",
  "hat": "Haitian; Haitian Creole",
  "heb": "Hebrew",
  "hin": "Hindi",
  "hrv": "Croatian",
  "hun": "Hungarian",
  "iku": "Inuktitut",
  "ind": "Indonesian",
  "isl": "Icelandic",
  "ita": "Italian",
  "ita_old": "Italian - Old",
  "jav": "Javanese",
  "jpn": "Japanese",
  "kan": "Kannada",
  "kat": "Georgian",
  "kat_old": "Georgian - Old",
  "kaz": "Kazakh",
  "khm": "Central Khmer",
  "kir": "Kirghiz; Kyrgyz",
  "kor": "Korean",
  "kur": "Kurdish",
  "lao": "Lao",
  "lat": "Latin",
  "lav": "Latvian",
  "lit": "Lithuanian",
  "mal": "Malayalam",
  "mar": "Marathi",
  "mkd": "Macedonian",
  "mlt": "Maltese",
  "msa": "Malay",
  "mya": "Burmese",
  "nep": "Nepali",
  "nld": "Dutch; Flemish",
  "nor": "Norwegian",
  "ori": "Oriya",
  "pan": "Panjabi; Punjabi",
  "pol": "Polish",
  "por": "Portuguese",
  "pus": "Pushto; Pashto",
  "ron": "Romanian; Moldavian; Moldovan",
  "rus": "Russian",
  "san": "Sanskrit",
  "sin": "Sinhala; Sinhalese",
  "slk": "Slovak",
  "slv": "Slovenian",
  "spa": "Spanish; Castilian",
  "spa_old": "Spanish; Castilian - Old",
  "sqi": "Albanian",
  "srp": "Serbian",
  "srp_latn": "Serbian - Latin",
  "swa": "Swahili",
  "swe": "Swedish",
  "syr": "Syriac",
  "tam": "Tamil",
  "tel": "Telugu",
  "tgk": "Tajik",
  "tgl": "Tagalog",
  "tha": "Thai",
  "tir": "Tigrinya",
  "tur": "Turkish",
  "uig": "Uighur; Uyghur",
  "ukr": "Ukrainian",
  "urd": "Urdu",
  "uzb": "Uzbek",
  "uzb_cyrl": "Uzbek - Cyrillic",
  "vie": "Vietnamese",
  "yid": "Yiddish"
}

// Rebuild the language <select>: discard the placeholder markup, then add one
// <option> per language code exposed by Tesseract, preselecting English.
languageSelect.innerHTML = '';

Object.values(Tesseract.languages).forEach((langCode) => {
  const opt = document.createElement('option');
  opt.value = langCode;
  // Show the friendly name when one is known, otherwise the raw code
  opt.textContent = LANGUAGES[langCode] || langCode;
  if (opt.value === 'eng') {
    opt.selected = true;
  }
  languageSelect.appendChild(opt);
});

// Refreshes the combined "Full document" box from the per-image textareas.
// The section is shown only when more than one textarea has non-blank text;
// otherwise it is emptied and hidden.
function showFullDocument() {
  const pageTexts = [];
  for (const box of document.querySelectorAll('.image-container textarea')) {
    const trimmed = box.value.trim();
    if (trimmed.length) {
      pageTexts.push(trimmed);
    }
  }

  const hasSeveralPages = pageTexts.length > 1;
  fullDocumentTextarea.value = hasSeveralPages ? pageTexts.join("\n\n") : '';
  fullDocumentSection.style.display = hasSeveralPages ? 'block' : 'none';
}

// Puts the trimmed text into the textarea and resizes it to fit the content.
//   ta   - textarea element to fill
//   text - text to show
function setTextarea(ta, text) {
  const { style } = ta;
  ta.value = text.trim();
  // Collapse to the natural height first so scrollHeight reflects the new text
  style.height = 'auto';
  style.height = `${ta.scrollHeight + 5}px`;
}

// Attach the dropzone's drag, drop and click handlers, then the example button
for (const [eventType, handler] of [
  ['dragover', handleDragOver],
  ['dragleave', handleDragLeave],
  ['drop', handleDrop],
  ['click', handleClick],
]) {
  dropzone.addEventListener(eventType, handler);
}
loadExampleButton.addEventListener('click', loadExamplePDF);

// dragover handler: always cancels the browser default, and highlights the
// dropzone only while file selection is allowed.
async function handleDragOver(event) {
  event.preventDefault();
  if (!fileSelectionAllowed) {
    return;
  }
  dropzone.classList.add('drag-over');
}

// dragleave handler: cancels the browser default and clears the highlight.
async function handleDragLeave(event) {
  event.preventDefault();
  // Clear unconditionally: if processing started (e.g. via paste or the example
  // button) while a drag was hovering, a gated removal would leave the dropzone
  // highlighted until the next drag.
  dropzone.classList.remove('drag-over');
}

// drop handler: cancels the browser default (which would navigate to the
// file), clears the highlight and processes the first dropped file.
// Does nothing while another file is being processed, or when the drop carries
// no files (for example dragged text or a link).
async function handleDrop(event) {
  event.preventDefault();
  dropzone.classList.remove('drag-over');
  if (!fileSelectionAllowed) {
    return;
  }

  const droppedFiles = event.dataTransfer ? event.dataTransfer.files : null;
  if (!droppedFiles || droppedFiles.length === 0) {
    return;
  }

  // Mirror the drop into the hidden <input> so both entry points agree
  fileInput.files = droppedFiles;
  await processFile(droppedFiles[0]);
}

// click handler for the dropzone: opens the hidden file picker unless a file
// is currently being processed.
async function handleClick() {
  if (!fileSelectionAllowed) {
    return;
  }
  fileInput.click();
}

// Process the file chosen through the hidden file picker.
fileInput.addEventListener('change', (event) => {
  const file = event.target.files[0];
  // The selection can be empty (no file chosen), and another job may have
  // started while the picker was open; in both cases there is nothing to do.
  if (!file || !fileSelectionAllowed) {
    return;
  }
  processFile(file);
});

// Fetches the example PDF from EXAMPLE_URL and runs it through processFile().
// Does nothing while another file is being processed. While running, the
// dropzone and the example button are disabled; both are always restored
// afterwards, whether the fetch or the processing succeeded or failed.
async function loadExamplePDF() {
  if (!fileSelectionAllowed) return;
  dropzone.innerText = 'Fetching example PDF...';
  dropzone.classList.add('disabled');
  loadExampleButton.disabled = true;
  fileSelectionAllowed = false;

  try {
    const response = await fetch(EXAMPLE_URL, { mode: 'cors' });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const blob = await response.blob();
    // The File's own type option forces application/pdf regardless of the
    // Content-Type the server sent; processFile() branches on file.type.
    const file = new File([blob], 'attention-is-all-you-need.pdf', { type: 'application/pdf' });

    // Mirror the fetched file into the hidden <input> for consistency
    try {
      const dt = new DataTransfer();
      dt.items.add(file);
      fileInput.files = dt.files;
    } catch (e) {
      // DataTransfer may not be available in some browsers; safe to ignore.
    }

    await processFile(file);
  } catch (err) {
    console.error('Failed to load example PDF:', err);
    alert('Sorry—failed to load the example PDF. Please check your connection and try again.');
  } finally {
    // Ensure UI returns to default label after processing
    dropzone.innerText = DROPZONE_DEFAULT_TEXT;
    dropzone.classList.remove('disabled');
    loadExampleButton.disabled = false;
    fileSelectionAllowed = true;
  }
}

// OCRs a single file and renders the results into the page.
//   file - a File/Blob; PDFs (type application/pdf) are rendered page by page,
//          anything else is treated as one image. A missing file is ignored.
// Returns a promise that resolves when every page has been processed. Errors
// from PDF rendering or OCR still reject that promise, but the OCR worker is
// always terminated and the dropzone is always re-enabled first.
async function processFile(file) {
  if (!file) {
    return;
  }

  // Lock the UI before the first await so a second file cannot be started
  // while the OCR worker is still being created.
  fileSelectionAllowed = false;
  dropzone.innerText = 'Processing file...';
  dropzone.classList.add('disabled');

  // Clear the previous results, releasing any object URLs created for them
  for (const oldImage of imageContainer.querySelectorAll('img')) {
    if (oldImage.src.startsWith('blob:')) {
      URL.revokeObjectURL(oldImage.src);
    }
  }
  fullDocumentTextarea.value = '';
  fullDocumentSection.style.display = 'none';
  imageContainer.innerHTML = '';

  let worker = null;
  try {
    worker = await Tesseract.createWorker(languageSelect.value);

    if (file.type === 'application/pdf') {
      const { numPages, imageIterator } = await convertPDFToImages(file);
      let done = 0;
      dropzone.innerText = `Processing ${numPages} page${numPages > 1 ? 's' : ''}`;
      for await (const { imageURL } of imageIterator) {
        const ta = await displayImage(imageURL);
        const { text } = await ocrImage(worker, imageURL);
        setTextarea(ta, text);
        showFullDocument();
        done += 1;
        dropzone.innerText = `Done ${done} of ${numPages}`;
      }
    } else {
      // Kept alive (not revoked here) because the <img> and any later re-run
      // of the OCR still read from this URL.
      const imageURL = URL.createObjectURL(file);
      const ta = await displayImage(imageURL);
      const { text } = await ocrImage(worker, imageURL);
      setTextarea(ta, text);
      showFullDocument();
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      // Restore the authored label rather than whatever progress text was
      // showing when this call started.
      dropzone.innerText = DROPZONE_DEFAULT_TEXT;
      dropzone.classList.remove('disabled');
      fileSelectionAllowed = true;
    }
  }
}

// Appends an <img> for the given URL plus an empty textarea for its OCR text
// to the image container.
//   imageURL - URL (data: or blob:) of the image to show
// Resolves to the new textarea so the caller can fill it in.
async function displayImage(imageURL) {
  // 1-based position of this image, used to give each pair a distinct name
  const position = imageContainer.querySelectorAll('img').length + 1;

  const imgElement = document.createElement('img');
  imgElement.alt = `Scanned image ${position}`;
  imgElement.src = imageURL;
  imageContainer.appendChild(imgElement);

  const altTextarea = document.createElement('textarea');
  altTextarea.classList.add('textarea-alt');
  altTextarea.placeholder = 'OCRing image...';
  // A placeholder is not an accessible name, so label the field explicitly
  altTextarea.setAttribute('aria-label', `OCR text for image ${position}`);
  imageContainer.appendChild(altTextarea);

  return altTextarea;
}

// Re-runs OCR over every image already on the page using the currently
// selected language, refilling each image's textarea in order.
// Does nothing when no results are displayed. When no other job is running it
// also locks the dropzone for the duration and restores its label afterwards;
// the OCR worker is terminated even if recognition fails.
async function rerunOCR() {
  const textareas = document.querySelectorAll('.image-container textarea');
  const images = document.querySelectorAll('.image-container img');

  if (!textareas.length) {
    return;
  }

  // Only take (and later release) the UI lock if nobody else holds it, so a
  // job that is already running keeps control of the dropzone state.
  const ownsLock = fileSelectionAllowed;
  if (ownsLock) {
    fileSelectionAllowed = false;
    dropzone.classList.add('disabled');
  }

  // Blank all the textareas
  textareas.forEach((ta) => {
    ta.value = '';
  });
  showFullDocument();

  const numImages = images.length;
  let done = 0;
  dropzone.innerText = `Processing ${numImages} image${numImages > 1 ? 's' : ''}`;

  let worker = null;
  try {
    worker = await Tesseract.createWorker(languageSelect.value);

    for (let i = 0; i < numImages; i++) {
      const imageURL = images[i].src;
      const ta = textareas[i];
      ta.placeholder = 'OCRing image...';
      const { text } = await ocrImage(worker, imageURL);
      setTextarea(ta, text);
      showFullDocument();
      done += 1;
      dropzone.innerText = `Done ${done} of ${numImages}`;
    }
  } finally {
    try {
      if (worker) {
        await worker.terminate();
      }
    } finally {
      if (ownsLock) {
        dropzone.innerText = DROPZONE_DEFAULT_TEXT;
        dropzone.classList.remove('disabled');
        fileSelectionAllowed = true;
      }
    }
  }
}


// OCR the first image found on the clipboard when the user pastes anywhere on
// the page. The event is never cancelled, so ordinary text pastes still work.
document.addEventListener('paste', (event) => {
  // Ignore pastes while another file is being processed
  if (!fileSelectionAllowed) {
    return;
  }
  const clipboard = event.clipboardData;
  if (!clipboard) {
    return;
  }
  const imageItem = Array.from(clipboard.items).find(
    (item) => item.kind === 'file' && item.type.includes('image')
  );
  // getAsFile() returns null when the item cannot be read as a file
  const file = imageItem ? imageItem.getAsFile() : null;
  if (file) {
    processFile(file);
  }
});

// Opens a PDF and exposes its pages as lazily rendered JPEG data URLs.
//   file - File/Blob containing the PDF
// Resolves to { numPages, imageIterator }, where imageIterator is an async
// iterator yielding { imageURL } once per successfully rendered page, each
// page rendered desiredWidth pixels wide. A page that fails to render is
// logged and skipped, so fewer than numPages items may be yielded.
// Rejects if the file cannot be read or parsed as a PDF.
async function convertPDFToImages(file) {
  // Hand the bytes to PDF.js directly; an object URL would need revoking later
  const data = await file.arrayBuffer();
  const pdf = await pdfjsLib.getDocument({ data }).promise;
  const numPages = pdf.numPages;
  async function* images() {
    try {
      for (let i = 1; i <= numPages; i++) {
        try {
          const page = await pdf.getPage(i);
          const unscaled = page.getViewport({ scale: 1 });
          const scale = desiredWidth / unscaled.width;
          const canvas = document.createElement('canvas');
          const context = canvas.getContext('2d');
          canvas.width = desiredWidth;
          canvas.height = scale * unscaled.height;
          await page.render({
            canvasContext: context,
            viewport: page.getViewport({ scale }),
          }).promise;
          const imageURL = canvas.toDataURL('image/jpeg', 0.8);
          // The data URL is self-contained, so the page's resources can go
          page.cleanup();
          yield { imageURL };
        } catch (error) {
          console.error(`Error rendering page ${i}:`, error);
        }
      }
    } finally {
      // Runs when iteration finishes or the consumer stops early
      await pdf.destroy();
    }
  }
  return {numPages: numPages, imageIterator: images()};
}

// Runs recognition on one image with the given worker.
//   worker   - object whose recognize(url) resolves to { data: { text } }
//   imageUrl - URL of the image to recognise
// Resolves to { text } holding the recognised text.
async function ocrImage(worker, imageUrl) {
  const result = await worker.recognize(imageUrl);
  return { text: result.data.text };
}

// When the language changes, record it in the address bar (English, the
// default, is left out of the URL) and re-run OCR on whatever is displayed.
languageSelect.addEventListener('change', async ({ target }) => {
  const chosen = target.value;
  const query = chosen !== 'eng' ? '?language=' + chosen : '';
  const newUrl = window.location.pathname + query;
  window.history.pushState({ path: newUrl }, '', newUrl);
  await rerunOCR();
});

// Selects the language named by the ?language= query parameter.
// Falls back to 'eng' when the parameter is missing, empty, or not one of the
// select's options; assigning an unknown value would leave nothing selected.
function setLanguageFromQueryString() {
  const params = new URLSearchParams(window.location.search);
  const requested = params.get('language');
  const isKnown = Array.from(languageSelect.options).some(
    (option) => option.value === requested
  );
  languageSelect.value = isKnown ? requested : 'eng';
}

// Apply the ?language= parameter on initial page load and again whenever the
// user navigates back or forward through history.
['load', 'popstate'].forEach((eventType) => {
  window.addEventListener(eventType, () => {
    setLanguageFromQueryString();
  });
});
</script>
</body>
</html>
